setwd("~/Dropbox/research/losingtouch/replication") # Remember to set this to your working directory

library(tidyverse)
library(viridis)
library(scales)
library(stm)
library(quanteda)
library(tidytext)
library(kableExtra)
library(ggrepel)

# Define colors
show_col(viridis(9))
vircol1 <- viridis(9)[1]
vircol2 <- viridis(9)[2]
vircol3 <- viridis(9)[3]
vircol4 <- viridis(9)[4]
vircol5 <- viridis(9)[5]
vircol6 <- viridis(9)[6]
vircol7 <- viridis(9)[7]

#Load collapsed speech covariates
pdfcol <- read_rds("11-pqmpgcol.rds") 

#Load STM model
stmmodel <- read_rds("13-stm.rds")

# Load DFM + covariates
pardfmr <- read_rds("13-pardfm.rds")

# Convert to STM format
dfm2stm <- convert(pardfmr,to="stm")

# Examine FREX scores
stmlabs <- labelTopics(stmmodel)

# Manually label topics
topiclabeldf <- tibble(topicnum=1:30,
                       topiclabel=c("Judicial system","Refugee policy","Public spending","Churches","Covid-19",
                                    "Immigration","Municipalities","Labor & Unemployment","Taxes","EU Treaties",
                                    "Housing","Schools","Tax law","Health","Youth",
                                    "Defense","Welfare vs. Taxes","International conflict","Crime","Unemployment",
                                    "Business regulation","Police","Iraq war","Public transport","Social policy",
                                    "US Foreign Policy","Culture","Agriculture","Labor market reform","Environmental regulation"))

# Extract vector of keywords by topic
getkeywordlist <- function(topicnum){
  frexvec <- paste(stmlabs$frex[topicnum,],collapse=", ")
  liftvec <- paste(stmlabs$lift[topicnum,],collapse=", ")
  scorevec <- paste(stmlabs$score[topicnum,],collapse=", ")
  fulltopicvec <- str_glue("FREX: {frexvec}. Lift: {liftvec}. Score: {scorevec}.")
  topicdfi <- tibble(keywords=fulltopicvec)
  return(topicdfi)
}

keyworddf <- topiclabeldf %>% 
  bind_cols(map_dfr(1:30,getkeywordlist))

textopictab <- keyworddf %>% 
  kbl(caption = "Structural Topic Model labels and keywords",format = "latex",col.names = c("No.","Label","Keywords"),
      align=c("r","l","p{12cm}"),vline="",booktabs = T,longtable = T) 

write_file(textopictab,"tables/app-topictab.tex")

# Estimate effects
stmeffs <- estimateEffect(c(1:30)~minister,stmobj = stmmodel,
                          meta=dfm2stm$meta,uncertainty = "Global")

#Tidy estimates
tidyests <- tidy(stmeffs) %>% 
  filter(term=="minister") %>% 
  mutate(topicnum=1:30) %>% 
  left_join(topiclabeldf,by="topicnum") %>% 
  arrange(-estimate) %>% 
  mutate(labfac=fct_inorder(topiclabel),
         rank=1:30)


ggplot(data=tidyests,aes(x=estimate,y=labfac,color=rank)) +
  theme_bw() +
  geom_vline(xintercept = 0,linetype=2,alpha=.4) +
  geom_errorbarh(aes(xmin=estimate-1.96*std.error,xmax=estimate+1.96*std.error),height=0,size=.5) +
  geom_errorbarh(aes(xmin=estimate-1.65*std.error,xmax=estimate+1.65*std.error),height=0,size=1) +
  geom_point() +
  scale_color_viridis_c(end=.8) +
  labs(x="Effect of govt. membership on prevalence",y="") +
  theme(legend.position="none") 

ggsave("figures/topicsplot_pres.pdf",width=7,height=3)
ggsave("figures/topicsplot.pdf",width=6,height=4)

# Extract topic proportion ests for including as covariates ----
str(stmmodel)

#prevalence matrix
prevmat <- stmmodel$theta

#get id for each document in the STM
docdf <- as_tibble(docvars(pardfmr)) %>% 
  mutate(spm=str_glue("{speaker}-{minister}"))

#define same id in pdfcol
pdfcol <- pdfcol %>% 
  mutate(spm=str_glue("{speaker}-{minister}"))

#index of which spm in pdfcol are in docdf
spmindex <- match(pdfcol$spm,docdf$spm) %>% na.omit()

#reduce pdfcol to only those in docdf
pdfcol <- pdfcol[spmindex,] 

#for each topic, regress prevalence on simplicity
lixvec <- pdfcol$avglixrevrs
rarvec <- pdfcol$avgrarrevrs

#function for regressing topic prevalence on simplicity
prevsimlm <- function(col) {
  tivec <- prevmat[,col]
  tilm_lix <- lm(tivec~lixvec) 
  tilm_rar <- lm(tivec~rarvec) 
  tilmdf <- bind_rows(broom::tidy(tilm_lix),broom::tidy(tilm_rar)) %>% 
    filter(term!="(Intercept)") %>% 
    mutate(topic=col,dv=c("LIX","Rare words"))
  return(tilmdf)
}

prevsimassocs <- map_dfr(1:30,prevsimlm) %>% 
  rename(simassoc=estimate) %>% 
  left_join(tidyests,by="topic") %>% 
  transmute(simassoc,govassoc=estimate,dv,topiclabel,labfac,rank)

ggplot(subset(prevsimassocs,dv=="LIX"),aes(govassoc,simassoc,label=topiclabel)) +
  geom_smooth(alpha=0,linetype="dashed",method="lm",color=vircol5,size=1) +
  geom_point(color=vircol3) +
  geom_text_repel(alpha=.7) +
  theme_bw() +
  labs(x="Correlation w. govt.",y="Correlation w. simplicity")

ggsave("figures/topicassoc-lix.pdf",width=5,height=5)

ggplot(subset(prevsimassocs,dv=="Rare words"),aes(govassoc,simassoc,label=topiclabel)) +
  geom_smooth(alpha=0,linetype="dashed",method="lm",color=vircol5,size=1) +
  geom_point(color=vircol3) +
  geom_text_repel(alpha=.7) +
  theme_bw() +
  labs(x="Correlation w. govt.",y="Correlation w. simplicity")

ggsave("figures/topicassoc-rar.pdf",width=5,height=5)

lm(simassoc~govassoc,data = subset(prevsimassocs,dv=="LIX")) %>% summary() 
lm(simassoc~govassoc,data = subset(prevsimassocs,dv=="Rare words")) %>% summary() 
